While there are a small number of threatened species with a large range, it is clear that Range is likely a strong predictor of Group status.
A smaller Range predicts a higher likelihood of a threatened or extinct grouping.
# Bin the continuous Range variable into 20 quantile buckets so it can be
# treated as a categorical predictor for Cramer's V.
corrDF <- train %>% mutate(Range = ntile(Range, n = 20))
# Convert the categorical predictors (including the binned Range) to factors.
# Fix: the original selection listed "Range" twice.
corrDF <- corrDF %>% mutate(across(c("Group", "LF", "GF", "Range", "Biomes",
                                     "Habitat_degradation", "Habitat_loss",
                                     "IAS", "Other", "Over_exploitation",
                                     "Pollution", "Unknown"), as_factor))
corrplot::corrplot(DescTools::PairApply(corrDF,DescTools::CramerV), type='lower')
Raw Factor
1 Parasitic 1
2 Tree 2
3 Suffrutex 3
# Re-create the factor-converted training copy (as above).
corrDF <- train %>% mutate(Range = ntile(Range, n = 20))
# Fix: the original selection listed "Range" twice.
corrDF <- corrDF %>% mutate(
  across(
    c("Group", "LF", "GF", "Range", "Biomes",
      "Habitat_degradation", "Habitat_loss",
      "IAS", "Other", "Over_exploitation",
      "Pollution", "Unknown"), as_factor))
# Show the first three GF values side by side: raw (from train) vs after the
# factor conversion (from corrDF). Matrix coercion makes both columns printable.
printFactors <- matrix(c(train$GF[1], train$GF[2], train$GF[3],
                         corrDF$GF[1], corrDF$GF[2], corrDF$GF[3]),
                       nrow = 3)
colnames(printFactors) <- c('Raw', 'Factor')
rownames(printFactors) <- c('1', '2', '3')
print(printFactors,quote=FALSE)
Group Counts Pre-Balancing: 490 148 23
Group Counts Post-Balancing: 490 490 490
# Balance the three classes by pairwise oversampling with ROSE::ovun.sample.
# Each pair is oversampled to N = 980 (2 x 490, the majority-class count).

# Pair classes 1 and 2 (drop class 3) and oversample the minority.
pair12 <- data_train[data_train$label != '3', ]
pair12_res <- ovun.sample(label ~ ., data = pair12,
                          method = "over", N = 980,
                          seed = 1)$data

# Pair classes 1 and 3 (drop class 2) and oversample the minority.
pair13 <- data_train[data_train$label != '2', ]
pair13_res <- ovun.sample(label ~ ., data = pair13,
                          method = "over", N = 980,
                          seed = 1)$data

# Recombine: untouched class 1 plus the oversampled classes 2 and 3.
oversampled_2 <- pair12_res[pair12_res$label == '2', ]
oversampled_3 <- pair13_res[pair13_res$label == '3', ]
data_train_1 <- pair12_res[pair12_res$label == '1', ]
data_train_combined <- rbind(data_train_1, oversampled_2, oversampled_3)
cat("Group Counts Pre-Balancing: ",
table(data_train$label),
"\nGroup Counts Post-Balancing: ",
table(data_train_combined$label))
\(X_{new}=\frac{X_{old}-\min(X_{old})}{\max(X_{old})-\min(X_{old})}\)
Score
Accuracy 0.93
Recall 0.87
Precision 0.89
F1 0.88
# Normalization 1: min-max scaling of each column to [0, 1].
# NOTE(review): the test set is scaled with its OWN min/max; strictly it
# should reuse the training min/max -- confirm this is intended.
features_train_1 <- as.data.frame(lapply(features_train,
                      function(x) {(x - min(x)) / (max(x) - min(x))}))
features_test_1 <- as.data.frame(lapply(features_test,
                      function(x) {(x - min(x)) / (max(x) - min(x))}))

# Attach the training labels (label is assumed to hold the training labels
# created earlier -- not visible in this chunk).
data_train_1 <- features_train_1
data_train_1$label <- label
class_counts_1 <- table(data_train_1$label)

# Fit a small random forest. Fix: drop the label column using this frame's
# own width -- the original indexed ncol(data_train_combined), a different
# data frame that only matched by coincidence.
model_1 <- randomForest(x = data_train_1[-ncol(data_train_1)],
                        y = as.factor(data_train_1$label), ntree = 2)
variable_importance_1 <- importance(model_1)
pred_comb_1 <- predict(model_1, features_test_1)

# Overall accuracy plus macro-averaged recall / precision / F1 over classes.
accuracy <- sum(label_test == pred_comb_1) / length(label_test)
label_test_factor <- as.factor(label_test)
pred_comb_1_factor <- as.factor(pred_comb_1)
cm <- confusionMatrix(pred_comb_1_factor, label_test_factor)
recall <- mean(cm$byClass[c("Class: 1", "Class: 2", "Class: 3"),
                          "Sensitivity"])
precision <- mean(cm$byClass[c("Class: 1", "Class: 2", "Class: 3"),
                             "Pos Pred Value"])
F1 <- 2 * recall * precision / (recall + precision)

# One-column score table for printing.
printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall',
                          'Precision', 'F1')
print(printTable)
\(X_{new}=\frac{X_{old}-\bar{X}_{old}}{\sigma_{X_{old}}}\)
Score
Accuracy 0.33
Recall 0.40
Precision 0.31
F1 0.35
# Normalization 2: per-column z-score standardization.
# NOTE(review): test columns use their own mean/sd; strictly they should
# reuse the training statistics -- confirm this is intended.
features_train_2 <- as.data.frame(lapply(features_train,
                      function(x) {(x - mean(x)) / sd(x)}))
features_test_2 <- as.data.frame(lapply(features_test,
                      function(x) {(x - mean(x)) / sd(x)}))

data_train_2 <- features_train_2
data_train_2$label <- label
class_counts_2 <- table(data_train_2$label)

# Fix: drop the label column using this frame's own width (the original
# used ncol(data_train_combined), which only matched by coincidence).
model_2 <- randomForest(x = data_train_2[-ncol(data_train_2)],
                        y = as.factor(data_train_2$label), ntree = 2)
variable_importance_2 <- importance(model_2)
pred_comb_2 <- predict(model_2, features_test_2)

# Overall accuracy plus macro-averaged recall / precision / F1.
accuracy <- sum(label_test == pred_comb_2) / length(label_test)
label_test_factor <- as.factor(label_test)
# Fix: was misnamed pred_comb_1_factor although it held model 2's output.
pred_comb_2_factor <- as.factor(pred_comb_2)
cm <- confusionMatrix(pred_comb_2_factor, label_test_factor)
recall <- mean(cm$byClass[c("Class: 1", "Class: 2", "Class: 3"),
                          "Sensitivity"])
precision <- mean(cm$byClass[c("Class: 1", "Class: 2", "Class: 3"),
                             "Pos Pred Value"])
F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)
\(X_{new}=\frac{X_{old}}{\max(|X_{old}|)}\)
Score
Accuracy 0.88
Recall 0.80
Precision 0.79
F1 0.79
# Normalization 3: per-column max-absolute scaling to [-1, 1].
# NOTE(review): test columns use their own max(|x|); strictly they should
# reuse the training statistic -- confirm this is intended.
features_train_3 <- as.data.frame(lapply(features_train,
                      function(x) {x / max(abs(x))}))
features_test_3 <- as.data.frame(lapply(features_test,
                      function(x) {x / max(abs(x))}))

data_train_3 <- features_train_3
data_train_3$label <- label
class_counts_3 <- table(data_train_3$label)

# Fix: drop the label column using this frame's own width (the original
# used ncol(data_train_combined), which only matched by coincidence).
model_3 <- randomForest(x = data_train_3[-ncol(data_train_3)],
                        y = as.factor(data_train_3$label),
                        ntree = 2)
variable_importance_3 <- importance(model_3)
pred_comb_3 <- predict(model_3, features_test_3)

# Overall accuracy plus macro-averaged recall / precision / F1.
accuracy <- sum(label_test == pred_comb_3) / length(label_test)
label_test_factor <- as.factor(label_test)
# Fix: was misnamed pred_comb_1_factor although it held model 3's output.
pred_comb_3_factor <- as.factor(pred_comb_3)
cm <- confusionMatrix(pred_comb_3_factor, label_test_factor)
recall <- mean(cm$byClass[c("Class: 1", "Class: 2", "Class: 3"),
                          "Sensitivity"])
precision <- mean(cm$byClass[c("Class: 1", "Class: 2", "Class: 3"),
                             "Pos Pred Value"])
F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)
\(X_{new}=\frac{X_{old}}{\sum{|X_{old}|}}\)
Score
Accuracy 0.77
Recall 0.60
Precision 0.55
F1 0.57
# Normalization 4: per-column sum normalization (x / sum(|x|)).
# NOTE(review): test columns use their own sum(|x|); strictly they should
# reuse the training statistic -- confirm this is intended.
features_train_4 <- as.data.frame(lapply(features_train,
                      function(x) {x / sum(abs(x))}))
features_test_4 <- as.data.frame(lapply(features_test,
                      function(x) {x / sum(abs(x))}))

data_train_4 <- features_train_4
data_train_4$label <- label
class_counts_4 <- table(data_train_4$label)

# Fix: drop the label column using this frame's own width (the original
# used ncol(data_train_combined), which only matched by coincidence).
model_4 <- randomForest(x = data_train_4[-ncol(data_train_4)],
                        y = as.factor(data_train_4$label), ntree = 2)
variable_importance_4 <- importance(model_4)
pred_comb_4 <- predict(model_4, features_test_4)

# Overall accuracy plus macro-averaged recall / precision / F1.
accuracy <- sum(label_test == pred_comb_4) / length(label_test)
label_test_factor <- as.factor(label_test)
# Fix: was misnamed pred_comb_1_factor although it held model 4's output.
pred_comb_4_factor <- as.factor(pred_comb_4)
cm <- confusionMatrix(pred_comb_4_factor, label_test_factor)
recall <- mean(cm$byClass[c("Class: 1", "Class: 2", "Class: 3"),
                          "Sensitivity"])
precision <- mean(cm$byClass[c("Class: 1", "Class: 2", "Class: 3"),
                             "Pos Pred Value"])
F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)),
                     ncol = 1, byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)
\(X_{new}=\frac{X_{old}}{\sqrt{\sum{X_{old}^2}}}\)
Score
Accuracy 0.75
Recall 0.62
Precision 0.53
F1 0.57
# Normalization 5: per-column L2 (unit-norm) scaling.
# NOTE(review): test columns use their own L2 norm; strictly they should
# reuse the training statistic -- confirm this is intended.
features_train_5 <- as.data.frame(lapply(features_train,
                      function(x) {x / sqrt(sum(x^2))}))
features_test_5 <- as.data.frame(lapply(features_test,
                      function(x) {x / sqrt(sum(x^2))}))

data_train_5 <- features_train_5
data_train_5$label <- label
class_counts_5 <- table(data_train_5$label)

# Fix: drop the label column using this frame's own width (the original
# used ncol(data_train_combined), which only matched by coincidence).
model_5 <- randomForest(x = data_train_5[-ncol(data_train_5)],
                        y = as.factor(data_train_5$label), ntree = 2)
variable_importance_5 <- importance(model_5)
pred_comb_5 <- predict(model_5, features_test_5)

# Overall accuracy plus macro-averaged recall / precision / F1.
accuracy <- sum(label_test == pred_comb_5) / length(label_test)
label_test_factor <- as.factor(label_test)
# Fix: was misnamed pred_comb_1_factor although it held model 5's output.
pred_comb_5_factor <- as.factor(pred_comb_5)
cm <- confusionMatrix(pred_comb_5_factor, label_test_factor)
recall <- mean(cm$byClass[c("Class: 1", "Class: 2", "Class: 3"),
                          "Sensitivity"])
precision <- mean(cm$byClass[c("Class: 1", "Class: 2", "Class: 3"),
                             "Pos Pred Value"])
F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)), ncol = 1,
                     byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)
Score
Accuracy 0.90
Recall 0.82
Precision 0.76
F1 0.79
# Ensemble: majority vote across the five per-normalization forests.
# NOTE(review): c() on the factor predictions relies on R >= 4.1 combining
# factor levels; on older R it yields integer codes and only works because
# the levels are '1', '2', '3' -- confirm R version in use.
n <- length(pred_comb_1)
final_pred <- rep(NA, n)
for (i in seq_len(n)) {  # seq_len is safe when n == 0 (1:n is not)
  votes <- c(pred_comb_1[i], pred_comb_2[i], pred_comb_3[i],
             pred_comb_4[i], pred_comb_5[i])
  # which.max breaks ties in favour of the lowest class label.
  final_pred[i] <- as.numeric(names(which.max(table(votes))))
}

# Ensemble metrics: accuracy plus macro-averaged recall / precision / F1.
accuracy <- sum(label_test == final_pred) / length(label_test)
final_pred_factor <- as.factor(final_pred)
label_test_factor <- as.factor(label_test)
cm_vote <- confusionMatrix(final_pred_factor, label_test_factor)

sensitivity_class1 <- cm_vote$byClass["Class: 1", "Sensitivity"]
sensitivity_class2 <- cm_vote$byClass["Class: 2", "Sensitivity"]
sensitivity_class3 <- cm_vote$byClass["Class: 3", "Sensitivity"]
recall <- (sensitivity_class1 + sensitivity_class2 + sensitivity_class3) / 3

precision_class1 <- cm_vote$byClass["Class: 1", "Pos Pred Value"]
precision_class2 <- cm_vote$byClass["Class: 2", "Pos Pred Value"]
precision_class3 <- cm_vote$byClass["Class: 3", "Pos Pred Value"]
precision <- (precision_class1 + precision_class2 + precision_class3) / 3

F1 <- 2 * recall * precision / (recall + precision)

printTable <- matrix(c(round(accuracy, 2), round(recall, 2),
                       round(precision, 2), round(F1, 2)), ncol = 1,
                     byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Recall', 'Precision', 'F1')
print(printTable)
\(\text{Accuracy}=\frac{\sum{\left(\text{Actual Label} = \text{Predicted Label}\right)}}{\text{Label Count}}\)
\(\text{Recall}=\frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}\)
\(\text{Precision}=\frac{\text{True Positives}}{\text{True Positives}+\text{False Positives}}\)
\(\text{F1}=\frac{2*(\text{Precision}*\text{Recall})}{\text{Precision}+\text{Recall}}\)
Score
Accuracy 0.9
Accuracy p-value <.001
95% CI (0.86,0.93)
Kappa 0.75
LC Thr Ext
Sensitivity 0.97 0.68 0.80
Specificity 0.84 0.97 0.97
Pos Pred Value 0.94 0.86 0.47
Neg Pred Value 0.91 0.91 0.99
Precision 0.94 0.86 0.47
Recall 0.97 0.68 0.80
F1 0.96 0.76 0.59
Prevalence 0.74 0.22 0.04
Detection Rate 0.72 0.15 0.03
Detection Prevalence 0.76 0.18 0.06
Balanced Accuracy 0.90 0.83 0.88
# --- Confusion-matrix heat map and summary tables ---------------------------
# Fix: the original recomputed cm_vote here (already computed above) and then
# computed the identical cm -- one call suffices.
cm <- confusionMatrix(final_pred_factor, label_test_factor)

# Long-format cell counts for the heat map, plus rounded overall statistics.
cm_d <- as.data.frame(cm$table)
cm_st <- data.frame(cm$overall)
cm_st$cm.overall <- round(cm_st$cm.overall, 2)

# Flag diagonal vs off-diagonal cells. The blanket NA replacement also hits
# the logical FALSE flags (FALSE == 0), so afterwards is.na(diag) is TRUE
# exactly on off-diagonal cells -- relied on by ref_freq below.
cm_d$diag <- cm_d$Prediction == cm_d$Reference
cm_d$ndiag <- cm_d$Prediction != cm_d$Reference
cm_d[cm_d == 0] <- NA
cm_d$Reference <- reverse.levels(cm_d$Reference)
# Signed frequency: off-diagonal (error) counts go negative so the diverging
# fill gradient separates misclassifications from hits.
cm_d$ref_freq <- cm_d$Freq * ifelse(is.na(cm_d$diag), -1, 1)

plt1 <- ggplot(data = cm_d, aes(x = Prediction, y = Reference,
                                fill = Freq)) +
  scale_x_discrete(position = "top") +
  geom_tile(data = cm_d, aes(fill = ref_freq)) +
  # NOTE(review): guide = FALSE is deprecated in ggplot2 >= 3.3.4; switch to
  # guide = "none" once the ggplot2 version in use is confirmed.
  scale_fill_gradient2(guide = FALSE, low = "red", high = "mediumvioletred",
                       mid = "mistyrose",
                       midpoint = 0, na.value = 'white') +
  geom_text(aes(label = Freq), color = 'black', size = 3) +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "none",
        panel.border = element_blank(),
        plot.background = element_blank(),
        axis.line = element_blank())
plt2 <- tableGrob(cm_st)
grid.arrange(plt1, plt2, nrow = 1, ncol = 2,
             top = textGrob("Confusion Matrix",
                            gp = gpar(fontsize = 25, font = 1)))

# Overall accuracy, its p-value (shown as "<.001" when tiny), 95% CI and
# Cohen's kappa as a printable one-column table.
# (Fix: this statement was fused onto the grid.arrange line above.)
printTable <- matrix(c(round(cm$overall['Accuracy'], 2),
                       if (cm$overall['AccuracyPValue'] < 0.001) {
                         "<.001"
                       } else {
                         round(cm$overall['AccuracyPValue'], 3)
                       },
                       paste("(", round(cm$overall['AccuracyLower'], 2),
                             ",", round(cm$overall['AccuracyUpper'], 2),
                             ")", sep = ""),
                       round(cm$overall['Kappa'], 2)), ncol = 1,
                     byrow = TRUE)
colnames(printTable) <- c('Score')
rownames(printTable) <- c('Accuracy', 'Accuracy p-value', '95% CI', 'Kappa')
print(printTable, quote = FALSE)

# Per-class statistics, relabelled LC / Thr / Ext for readability.
cmBC <- cm$byClass
rownames(cmBC) <- c("LC", "Thr", "Ext")
print(t(round(cmBC, 2)), quote = FALSE)